import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing,neighbors
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans,AgglomerativeClustering
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix,precision_score,recall_score,f1_score,r2_score,roc_curve,roc_auc_score
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn import tree,linear_model
from sklearn import svm
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn import model_selection
from matplotlib.legend_handler import HandlerLine2D
import plotly.graph_objects as go
from collections import Counter
import operator
# Load the Rotten Tomatoes movies dataset from the working directory.
#data = pd.read_csv("/content/drive/Shared drives/IST-707-project/rotten_tomatoes_movies.csv")
data = pd.read_csv("./rotten_tomatoes_movies.csv")
# Fixed random seed reused for every train/test split below.
seed=77
# Work on a copy so the raw `data` frame is not mutated by the pipelines.
df=data.copy()
# Codeblock
# Quick sanity checks: first rows, dimensions, numeric summary.
data.head(2)
data.shape
data.describe()
Replacing missing audience rating (and other numeric) values with the median
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
class ColumnwiseMedianImputer(BaseEstimator, TransformerMixin):
    """Impute NaNs in the given numeric columns with each column's median.

    The values are written back onto the frame passed to ``transform``.
    """
    def __init__(self, cols):
        self._median_imputed_cols = cols
    def fit(self, x, y=None):
        # Stateless: medians are computed at transform time.
        return self
    def transform(self, x, y=None):
        subset = x[self._median_imputed_cols]
        filled = subset.fillna(subset.median())
        # Copy the imputed values back column by column.
        for name in filled.columns:
            x[name] = filled[name]
        return x
# Impute the three numeric columns' NaNs with their medians (in place on df).
pipe = Pipeline([
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count','runtime_in_minutes','audience_rating'])),
])
# NOTE: transform() is called without fit(); fine here since the imputer is stateless.
result = pipe.transform(df)
# Verify the imputed columns no longer contain NaNs.
result.isnull().sum()
Dropping Critic Consensus as it is a redundant column
Heatmap of correlation matrix which gives us an intuitive sense of correlation between variables
# Heatmap of the absolute pairwise correlations between numeric columns.
import seaborn as sns
plt.clf()
fig=plt.figure(figsize=(20,5))
sns.heatmap(result.corr().abs(),linecolor='white',linewidths=8)
Directors with the highest audience ratings
# Number of non-null director entries.
result['directors'].count()
# Top 5 directors by mean audience rating (no minimum movie count applied yet).
result[['directors','audience_rating']] \
.groupby('directors',as_index=False) \
.mean() \
.sort_values(by='audience_rating',ascending=False) \
.head(5)
It does not, however, make intuitive sense: an established director does not make only a single movie. A single-movie entry can be an outlier, which can be a serious problem.
Filtering by directors that have made at least 8 movies
# Keep only directors with at least 8 movies to avoid single-movie outliers.
filtered=result.groupby('directors').filter(lambda x: len(x)>=8)
filtered.shape
Most Revered Directors according to audiences across the globe
# Bar chart of the 9 best- and 9 worst-rated directors (>= 8 movies each).
fig, ax = plt.subplots(figsize=(20,6), dpi=90, facecolor='w', edgecolor='k')
best = filtered[['directors','audience_rating']] \
    .groupby('directors', as_index=False) \
    .mean() \
    .sort_values(by='audience_rating', ascending=False) \
    .head(9)
worst = filtered[['directors','audience_rating']] \
    .groupby('directors', as_index=False) \
    .mean() \
    .sort_values(by='audience_rating', ascending=True) \
    .head(9)
# BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames.
polarity = pd.concat([best, worst])
sns.barplot(x='audience_rating', y='directors', data=polarity, orient='h') \
    .set_title('Rating of Directors', fontSize='23')
plt.tight_layout()
Filtering by genres that have more than 20 movies
# Mean audience rating per genre, restricted to genres with >= 20 movies.
filtered_genre = result.groupby('genre').filter(lambda x: len(x)>=20)
genera_rating_highest = filtered_genre[['genre','audience_rating']] \
.groupby('genre',as_index=False) \
.mean() \
.sort_values(by='audience_rating',ascending=False) \
.head(7)
fig=plt.figure(figsize=(20,6),dpi=90,facecolor='w',edgecolor='k')
sns.barplot(y='genre',x= 'audience_rating',data=genera_rating_highest) \
.set_title('Genres with the highest Ratings', fontSize=23)
import matplotlib as mpl
# Count plot of the 7 most common genres, colour-split by audience rating.
fig=plt.figure(figsize=(25,6),dpi=90,facecolor='w',edgecolor='k')
# NOTE(review): cmap is defined but never used below.
cmap = sns.light_palette('purple', as_cmap=True)
ax = sns.countplot(x="genre", hue="audience_rating", data=result,
palette='Blues', orientation='vertical',
order=result.genre.value_counts(ascending=False).iloc[:7].index)
# The per-rating legend would have one entry per rating value; drop it and
# show a colorbar instead.
ax.legend_.remove()
plt.title('The Distribution of Each genre', fontSize='20')
ax.set_xlabel(ax.get_xlabel(), fontSize=25)
ax.set_ylabel(ax.get_ylabel(), fontSize=25)
norm = mpl.colors.Normalize(vmin=0,vmax=2)
sm = plt.cm.ScalarMappable(cmap='Blues', norm=norm)
sm.set_array([ax])
plt.colorbar(sm)
# import plotly.graph_objs as go
# trace = go.Histogram(x=result[result.genre == 'Drama'],xbins=1),marker=dict(color='rgb(0, 0, 100)')
# layout = go.Layout(
# title="Histogram Frequency Counts"
# )
# fig = go.Figure(data=go.Data([trace]), layout=layout)
# py.iplot(fig, filename='histogram-freq-counts')
# fig, ax=plt.subplots(2, 1, figsize=(25,6))
# sns.distplot(data['audience_rating'], ax=ax[0])
# ax[0].axvline(df['audience_rating'].mean(),color='r')
# ax[0].axvline(df['audience_rating'].median(),color='g')
# ax[0].set_title('The Distribution of Audiance Rating', fontSize=23)
# sns.distplot(data['tomatometer_rating'], ax=ax[1])
# ax[1].axvline(df['tomatometer_rating'].mean(),color='r')
# ax[1].axvline(df['tomatometer_rating'].median(),color='g')
# ax[1].set_title('The Distribution of Tomatometer Rating', fontSize=23)
# plt.tight_layout()
data[['audience_rating','tomatometer_rating']]
import plotly.figure_factory as ff
# WARNING: this rebinds the global `data` to a fully dropna()'d copy; every
# later cell that reads `data` sees only complete rows.
data = data.dropna()
group_labels = data[['audience_rating','tomatometer_rating']].columns
# Overlaid distributions of audience vs. tomatometer ratings.
fig = ff.create_distplot([data['audience_rating'],data['tomatometer_rating']], group_labels, bin_size=1)
fig.show()
class FeaturePicker(BaseEstimator, TransformerMixin):
    """Project a DataFrame down to a fixed list of columns."""
    def __init__(self, cols):
        self._cols = cols
    @property
    def cols(self):
        # Read-only view of the configured column list.
        return self._cols
    def fit(self, x, y=None):
        # Nothing to learn from the data.
        return self
    def transform(self, x, y=None):
        # Plain column projection; raises KeyError if a column is absent.
        wanted = self._cols
        return x[wanted]
# First cut of the time-series pipeline: keep only the date/status/rating columns.
time_series_pipe = Pipeline([
('feature_picker', FeaturePicker(['in_theaters_date','on_streaming_date', 'tomatometer_status','audience_rating','tomatometer_rating'])),
])
# BUG FIX: the original called the unrelated, previously defined `pipe` here,
# so the freshly built time_series_pipe was never actually used.
ts_data = time_series_pipe.transform(data)
class TomatometerStatusDiscritizer(BaseEstimator, TransformerMixin):
    """Map the textual tomatometer_status labels to numeric codes, in place."""
    def __init__(self, mapping):
        # mapping: dict of label -> numeric code, e.g. {'Rotten': 1, ...}.
        self._mapping = mapping
    def fit(self, x, y=None):
        # Stateless transformer.
        return self
    def transform(self, x, y=None):
        # Series.map leaves NaN for any label not present in the mapping.
        codes = x['tomatometer_status'].map(self._mapping)
        x['tomatometer_status'] = codes
        return x
# Time-series pipeline, now also encoding the status labels as ordinals.
time_series_pipe = Pipeline([
('feature_picker', FeaturePicker(['in_theaters_date','on_streaming_date', 'tomatometer_status','audience_rating','tomatometer_rating'])),
('tomatometer_status_discritizer', TomatometerStatusDiscritizer({'Certified Fresh':3,'Fresh':2,'Rotten':1})),
])
# BUG FIX: was `pipe.transform(data)`, which ran the wrong (earlier) pipeline.
ts_data = time_series_pipe.transform(data)
class TimeseriesMiscCleaner(BaseEstimator, TransformerMixin):
    """Drop incomplete rows, parse the two date columns and index by release date."""
    def __init__(self):
        pass
    def fit(self, x, y=None):
        # Stateless.
        return self
    def transform(self, x, y=None):
        x = x.dropna()
        # BUG FIX: parse the dates BEFORE setting the index, so the index is a
        # proper DatetimeIndex (the original set the index first, leaving it as
        # raw strings while only the column got converted).
        x['in_theaters_date'] = pd.to_datetime(x['in_theaters_date'])
        x['on_streaming_date'] = pd.to_datetime(x['on_streaming_date'])
        x = x.set_index(x['in_theaters_date'])
        return x
# Full time-series cleaning pipeline: select columns, encode status, parse dates.
time_series_pipe = Pipeline([
('feature_picker', FeaturePicker(['in_theaters_date','on_streaming_date', 'tomatometer_status','audience_rating','tomatometer_rating'])),
('tomatometer_status_discritizer', TomatometerStatusDiscritizer({'Certified Fresh':3,'Fresh':2,'Rotten':1})),
('timeseries_misc_cleaner', TimeseriesMiscCleaner()),
])
ts_data = time_series_pipe.transform(data)
ts_data.head(5)
# Split into rotten (code 1) and certified-fresh (code 3) subsets.
ts_data_rotten = ts_data[ts_data.tomatometer_status == 1]
ts_data_fresh = ts_data[ts_data.tomatometer_status == 3]
import plotly.express as px
# Yearly share (%) of rotten releases; replace(0, 1) guards against division by zero.
ts_data_rotten_plot = round((ts_data_rotten.groupby([pd.Grouper(key='in_theaters_date', freq='Y')]).count()/ts_data.groupby([pd.Grouper(key='in_theaters_date', freq='Y')]).count().replace(0, 1))*100,2)
# NOTE(review): dropna() returns a copy that is discarded here; the plotted frame keeps its NaNs.
ts_data_rotten_plot.dropna()
# Yearly share (%) of certified-fresh releases.
ts_data_fresh_plot = round((ts_data_fresh.groupby([pd.Grouper(key='in_theaters_date', freq='Y')]).count()/ts_data.groupby([pd.Grouper(key='in_theaters_date', freq='Y')]).count().replace(0, 1))*100,2)
ts_data_fresh_plot.dropna()
# Yearly movie counts and yearly mean ratings.
ts_data_plot = ts_data.groupby([pd.Grouper(key='in_theaters_date', freq='Y')]).count()
ts_audience_plot=ts_data[['audience_rating','in_theaters_date']].groupby([pd.Grouper(key='in_theaters_date', freq='Y')]).mean()
ts_tomato_plot=ts_data[['tomatometer_rating','in_theaters_date']].groupby([pd.Grouper(key='in_theaters_date', freq='Y')]).mean()
fig = px.line(ts_data_plot, x=ts_data_plot.index, y="tomatometer_status", title='Total Movies Released')
fig.show()
# Line plot of the yearly good/bad movie ratios computed above.
fig = go.Figure()
fig.add_trace(go.Scatter(x=ts_data_fresh_plot.index, y=ts_data_fresh_plot["tomatometer_status"],
mode='lines',
name='Ratio of Good Movies over the Years'))
fig.add_trace(go.Scatter(x=ts_data_rotten_plot.index, y=ts_data_rotten_plot["tomatometer_status"],
mode='lines+markers',
name='Ratio of Bad Movies over the Years'))
fig.update_layout(title='Ratio of Movies over the Years',
xaxis_title='Year',
yaxis_title='Ratio (%)')
fig.show()
# One-hot the ordinal status, then sum per year to get counts of each status
# (1=Rotten, 2=Fresh, 3=Certified Fresh).
ts_data_dummy = pd.get_dummies(ts_data['tomatometer_status'],prefix='tomatometer_status')
ts_df = pd.concat([ts_data,ts_data_dummy], axis=1)
ts_df_plot = ts_df.groupby([pd.Grouper(key='in_theaters_date', freq='Y')]).sum()
ts_df_plot.eval('tomatometer_status_total = tomatometer_status_1 + tomatometer_status_2 + tomatometer_status_3',inplace=True)
fig = go.Figure()
# Plot each status as a percentage of the year's total releases.
fig.add_trace(go.Scatter(x=ts_df_plot.index, y=ts_df_plot.tomatometer_status_1 / ts_df_plot.tomatometer_status_total * 100,
mode='lines',
name='The Percentage of Rotten Movies'))
fig.add_trace(go.Scatter(x=ts_df_plot.index, y=ts_df_plot.tomatometer_status_2 / ts_df_plot.tomatometer_status_total * 100,
mode='lines+markers',
name='The Percentage of Fresh Movies'))
fig.add_trace(go.Scatter(x=ts_df_plot.index, y=ts_df_plot.tomatometer_status_3 / ts_df_plot.tomatometer_status_total * 100,
mode='lines', name='The Percentage of CFresh Movies'))
fig.update_layout(title='The Percentage Status of Movies over the Years',
xaxis_title='Year',
yaxis_title='Percentage (%)')
fig.show()
def cast_dict(status):
    """Count actor-name token occurrences in casts of movies with the given status.

    The comma-separated ``cast`` strings are stringified and split on commas,
    then tallied. Relies on the module-level ``df`` frame.
    """
    df_status = df[df.tomatometer_status == status]
    # BUG FIX: the original tested `i != ''` against the (integer) row index,
    # which is always true; filter on the cast value itself and skip NaNs.
    list_status = [c for c in df_status.cast.dropna() if c != '']
    list_status = str(list_status).split(',')
    status_dict = Counter(list_status)
    return status_dict
# Tally casts for each status; merge Certified Fresh + Fresh counts for "hit" movies.
Cfresh_dict = cast_dict('Certified Fresh')
fresh_dict = cast_dict('Fresh')
rotten_dict = cast_dict('Rotten')
cast_fresh = Cfresh_dict.copy()
# Counter.update ADDS counts, so cast_fresh holds Cfresh + Fresh totals.
cast_fresh.update(fresh_dict)
cast_fresh_sorted = sorted(cast_fresh.items(), key=operator.itemgetter(1),reverse=True)
# Skip the first two entries (presumably str(list) formatting artifacts — verify).
names=[cast_fresh_sorted[i][0] for i in range(2,12)]
count=[cast_fresh_sorted[i][1] for i in range(2,12)]
plt.figure(dpi=300)
sns.barplot(y=names,x=count,palette="GnBu_d").set_title('Actors that appear in Hit movies')
cast_rotten_sorted = sorted(rotten_dict.items(), key=operator.itemgetter(1),reverse=True)
cast_rotten_sorted[:11]
# Skip index 1 (presumably another formatting artifact — verify) when plotting.
names=[cast_rotten_sorted[i][0] for i in range(0,11) if i !=1]
count=[cast_rotten_sorted[i][1] for i in range(0,11) if i !=1]
plt.figure(dpi=300)
sns.barplot(y=names,x=count,palette="GnBu_d").set_title('Actors that appear in rotten movies')
def studio(status):
    """Return a DataFrame mapping studio name -> number of movies with `status`.

    Relies on the module-level ``data`` frame.
    """
    subset = data[data.tomatometer_status == status]
    counts = Counter(subset.studio_name.tolist())
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    frame = pd.DataFrame(ranked, columns=['Studio_name', status])
    return frame.set_index('Studio_name')
# Per-studio counts for each status, joined into one frame (NaN -> 0 where a
# studio has no movies of a given status).
Cfresh_studio = studio('Certified Fresh')
fresh_studio = studio('Fresh')
rotten_studio = studio('Rotten')
df_studio = pd.concat([Cfresh_studio,fresh_studio,rotten_studio], axis=1)
df_studio = df_studio.fillna(0)
# BUG FIX: the x labels were taken from the UNsorted frame's index while the y
# values came from the sorted slice, mislabelling the bars; take both from the
# same sorted slice.
top_studios = df_studio.sort_values(by='Certified Fresh', ascending=False)[:10]
fig = px.bar(top_studios, y='Certified Fresh', x=top_studios.index, title='The Top 10 Studios')
fig.show()
bottom_studios = df_studio.sort_values(by='Rotten', ascending=False)[:10]
fig = px.bar(bottom_studios, y='Rotten', x=bottom_studios.index, title='The Bottom 10 Studios')
fig.show()
def genre_plot(status):
    """Tally genre tokens for movies with the given status, bucketing rare ones.

    Genre strings (e.g. "Action & Adventure") are split on '&' and ',' and the
    tokens counted; tokens seen fewer than 500 times are folded into an
    'other' bucket. Relies on the module-level ``data`` frame.
    """
    df_status = data[data.tomatometer_status == status]
    # BUG FIX: the original tested `i != ''` against the (integer) row index,
    # which is always true; filter on the genre value itself and skip NaNs.
    genre_status = [g for g in df_status.genre.dropna() if g != '']
    data_list = str(genre_status).replace('&', ',')
    data_list = str(data_list).replace('\'', '')
    data_list = str(data_list).replace('[', ' ')
    data_list = str(data_list).replace(']', '')
    data_list = str(data_list).split(',')
    data_dict = Counter(data_list)
    # Fold infrequent tokens into 'other'; the list() snapshot avoids mutating
    # the dict while iterating it.
    for token, cnt in list(data_dict.items()):
        if cnt < 500:
            data_dict['other'] += cnt
            del data_dict[token]
    return data_dict
# Genre distributions for fresh (Certified + Fresh merged) vs rotten movies.
genre_Cfresh_dict = genre_plot('Certified Fresh')
genre_fresh_dict = genre_plot('Fresh')
genre_rotten_dict = genre_plot('Rotten')
genre_fresh = genre_Cfresh_dict.copy()
# Counter.update ADDS the counts together.
genre_fresh.update(genre_fresh_dict)
genre_fresh_sorted = sorted(genre_fresh.items(), key=operator.itemgetter(1),reverse=True)
genre_rotten_sorted = sorted(genre_rotten_dict.items(), key=operator.itemgetter(1),reverse=True)
genre_fresh_df = pd.DataFrame(genre_fresh_sorted, columns=['Genre', 'Counts'])
genre_rotten_df = pd.DataFrame(genre_rotten_sorted, columns=['Genre', 'Counts'])
fig = px.pie(genre_fresh_df, values='Counts', names='Genre')
fig.show()
fig = px.pie(genre_rotten_df, values='Counts', names='Genre')
fig.show()
def rating_plot(status):
    """Return a DataFrame mapping MPAA rating -> count of movies with `status`.

    Strips stray ')' characters from the rating strings. Relies on the
    module-level ``data`` frame.
    """
    # Copy so we never mutate a slice of the global frame (avoids the
    # SettingWithCopy trap of assigning through a filtered view).
    df_status = data[data.tomatometer_status == status].copy()
    # regex=False: we want a literal replace, and ')' alone is not a valid regex.
    df_status.rating = df_status.rating.str.replace(')', '', regex=False)
    status_rating = Counter(df_status.rating.tolist())
    status_rating = sorted(status_rating.items(), key=operator.itemgetter(1), reverse=True)
    status_rating = pd.DataFrame(status_rating, columns=['rating', status]).set_index('rating')
    return status_rating
# Rating distribution per status; merge Certified Fresh + Fresh into New_Fresh.
Cfresh_rating = rating_plot('Certified Fresh')
fresh_rating = rating_plot('Fresh')
rotten_rating = rating_plot('Rotten')
df_rating = pd.concat([Cfresh_rating,fresh_rating,rotten_rating], axis=1)
df_rating = df_rating.fillna(0)
df_rating['New_Fresh'] = df_rating["Certified Fresh"] + df_rating['Fresh']
df_rating
fig = px.pie(df_rating, values='New_Fresh', names=df_rating.index, title='The ratio of Rating in Fresh')
fig.show()
fig = px.pie(df_rating, values='Rotten', names=df_rating.index, title='The ratio of Rating in Rotten')
fig.show()
# NOTE(review): the next two lines duplicate the Rotten pie chart above.
fig = px.pie(df_rating, values='Rotten', names=df_rating.index, title='The ratio of Rating in Rotten')
fig.show()
Data Preprocessing
# Inspect the raw rating labels before cleaning them.
df.rating.unique()
class parentheseprocessed(BaseEstimator, TransformerMixin):
    """Strip literal ')' characters from one string column, in place.

    (Lowercase name kept for compatibility with the existing pipelines.)
    """
    def __init__(self, col):
        self._parentheseprocessed_col = col
    def fit(self, x, y=None):
        # Stateless.
        return self
    def transform(self, x, y=None):
        col = self._parentheseprocessed_col
        # regex=False: ')' on its own is not a valid regular expression, and
        # pandas' default for the `regex` flag has changed across versions;
        # we always want a literal replacement.
        x[col] = x[col].str.replace(')', '', regex=False)
        return x
# Clean the rating column (drop stray ')') on df in place.
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
])
result = pipe.transform(df)
class ColumnRemover(BaseEstimator, TransformerMixin):
    """Drop a fixed list of columns from the frame."""
    def __init__(self, cols):
        self._remove_cols = cols
    @property
    def remove_cols(self):
        # Read-only access to the configured column list.
        return self._remove_cols
    def fit(self, x, y=None):
        # Nothing to learn.
        return self
    def transform(self, x, y=None):
        doomed = self._remove_cols
        return x.drop(doomed, axis=1)
# Pipeline v2: clean rating, impute audience_count, drop redundant critics_consensus.
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count'])),
('column_remover', ColumnRemover(['critics_consensus'])),
])
result = pipe.transform(df)
print(*result.columns, sep='\n')
# Codeblock
df.corr()
Dropping Rotten Tomatoes Link and poster_image URL as it is a redundant column
# Pipeline v3: also drop the link/URL columns, which carry no predictive signal.
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
])
result = pipe.transform(df)
print(*result.columns, sep='\n')
df.corr()
Dropping rows where directors, studio or writers are not present
class MissRowsRemover(BaseEstimator, TransformerMixin):
    """Drop rows that have NaN in any of the given columns."""
    def __init__(self, cols):
        self._affected_cols = cols
    @property
    def affected_cols(self):
        # Read-only access to the configured columns.
        return self._affected_cols
    def fit(self, x, y=None):
        # Nothing to learn.
        return self
    def transform(self, x, y=None):
        required = self._affected_cols
        return x.dropna(subset=required)
# Pipeline v4: additionally drop rows missing directors/writers/studio_name.
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
])
result = pipe.transform(df)
result.shape
Grouping by Genre to find audience Rating
class GroupwiseMedianImputer(BaseEstimator, TransformerMixin):
    """Impute NaNs in a column with the median of that column within a group.

    ``col_pairs`` maps target column -> grouping column, e.g.
    {'audience_rating': 'genre'} fills a missing audience rating with the
    median rating of the movie's genre.
    """
    def __init__(self, col_pairs):
        self._col_pairs = col_pairs
    @property
    def col_pairs(self):
        return self._col_pairs
    def fit(self, x, y=None):
        # Stateless: group medians are computed at transform time.
        return self
    def transform(self, x, y=None):
        for col, group in self._col_pairs.items():
            # Use transform() rather than apply(): transform guarantees a
            # result aligned to x's index, whereas groupby.apply can return a
            # group-keyed index in newer pandas and break this assignment.
            x[col] = x.groupby(group)[col].transform(lambda s: s.fillna(s.median()))
        return x
# Pipeline v5: genre-wise median imputation of audience_rating.
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
('groupwise_median_imputer', GroupwiseMedianImputer({'audience_rating' : 'genre'})),
])
result = pipe.transform(df)
Imputing the rest by Median
# Pipeline v6: fall back to the global median for ratings still missing
# (groups whose ratings are all NaN have no group median to use).
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
('groupwise_median_imputer', GroupwiseMedianImputer({'audience_rating' : 'genre'})),
('numerical_median_imputer_2', ColumnwiseMedianImputer(['audience_rating'])),
])
result = pipe.transform(df)
result.isnull().sum()
df.info()
class BinningBuilder(BaseEstimator, TransformerMixin):
    """Discretize a numeric column into labelled bins via pd.cut.

    tar: source column; dest: new column; binnings: bin edges; labels: bin names.
    """
    def __init__(self, tar, dest, binnings, labels):
        self._tar = tar
        self._dest = dest
        self._binnings = binnings
        self._labels = labels
    @property
    def tar(self):
        return self._tar
    @property
    def dest(self):
        return self._dest
    @property
    def binning(self):
        return self._binnings
    @property
    def labels(self):
        return self._labels
    def fit(self, x, y=None):
        # Nothing to learn.
        return self
    def transform(self, x, y=None):
        # pd.cut yields NaN for values outside the outermost bin edges.
        binned = pd.cut(x[self._tar], bins=self._binnings, labels=self._labels)
        x[self._dest] = binned
        return x
# Pipeline v7: bin audience_rating into Spilled (<=50) / Upright (>50).
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
('groupwise_median_imputer', GroupwiseMedianImputer({'audience_rating' : 'genre'})),
('numerical_median_imputer_2', ColumnwiseMedianImputer(['audience_rating'])),
('binning_builder', BinningBuilder('audience_rating', 'audience_status', [-10, 50, 101], ['Spilled', 'Upright'])),
])
result = pipe.transform(df)
result['audience_status'].isnull().sum()
class CustomizedOneHotEncoding(BaseEstimator, TransformerMixin):
    """One-hot encode the given columns onto the frame and drop the originals."""
    def __init__(self, cols):
        self._cols = cols
    @property
    def cols(self):
        return self._cols
    def fit(self, x, y=None):
        # Nothing to learn.
        return self
    def transform(self, x, y=None):
        for source in self._cols:
            dummies = pd.get_dummies(x[source], prefix=source)
            # Attach each indicator column alongside the existing data.
            for indicator in dummies.columns:
                x[indicator] = dummies[indicator]
        return x.drop(self._cols, axis=1)
# Pipeline v8: one-hot encode the binned audience_status.
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
('groupwise_median_imputer', GroupwiseMedianImputer({'audience_rating' : 'genre'})),
('numerical_median_imputer_2', ColumnwiseMedianImputer(['audience_rating'])),
('binning_builder', BinningBuilder('audience_rating', 'audience_status', [-10, 50, 101], ['Spilled', 'Upright'])),
('customized_one_hot_encoding', CustomizedOneHotEncoding(['audience_status'])),
])
result = pipe.transform(df)
result.columns
# Pipeline v9: same as v8, plus final feature selection (numeric + status columns).
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
('groupwise_median_imputer', GroupwiseMedianImputer({'audience_rating' : 'genre'})),
('numerical_median_imputer_2', ColumnwiseMedianImputer(['audience_rating'])),
('binning_builder', BinningBuilder('audience_rating', 'audience_status', [-10, 50, 101], ['Spilled', 'Upright'])),
('customized_one_hot_encoding', CustomizedOneHotEncoding(['audience_status'])),
('feature_picker', FeaturePicker(['rating',
'runtime_in_minutes',
'tomatometer_status',
'audience_rating',
'audience_status_Spilled',
'audience_status_Upright',
'audience_count',
'audience_top_critics_count',
'audience_fresh_critics_count',
'audience_rotten_critics_count'])),
])
result = pipe.transform(df)
Checking the data types of columns
# Confirm the dtypes of the selected feature columns.
result.dtypes
Removing Duplicate values
class DuplicationRemover(BaseEstimator, TransformerMixin):
    """Drop ALL copies of rows duplicated on the given columns (keep=False)."""
    def __init__(self, subset_cols):
        self._subset_cols = subset_cols
    @property
    def subset_cols(self):
        return self._subset_cols
    def fit(self, x, y=None):
        # Nothing to learn.
        return self
    def transform(self, x, y=None):
        key_cols = self._subset_cols
        # keep=False removes every occurrence, not just the extra copies.
        return x.drop_duplicates(subset=key_cols, keep=False)
# Pipeline v10: adds removal of rows duplicated across all selected features.
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
('groupwise_median_imputer', GroupwiseMedianImputer({'audience_rating' : 'genre'})),
('numerical_median_imputer_2', ColumnwiseMedianImputer(['audience_rating'])),
('binning_builder', BinningBuilder('audience_rating', 'audience_status', [-10, 50, 101], ['Spilled', 'Upright'])),
('customized_one_hot_encoding', CustomizedOneHotEncoding(['audience_status'])),
('feature_picker', FeaturePicker(['rating',
'runtime_in_minutes',
'tomatometer_status',
'audience_rating',
'audience_status_Spilled',
'audience_status_Upright',
'audience_count',
'audience_top_critics_count',
'audience_fresh_critics_count',
'audience_rotten_critics_count'])),
('duplication_remover', DuplicationRemover(["rating",
"runtime_in_minutes",
"tomatometer_status",
"audience_rating",
"audience_status_Spilled",
"audience_status_Upright",
"audience_count",
"audience_top_critics_count",
"audience_fresh_critics_count",
"audience_rotten_critics_count"])),
])
result = pipe.transform(df)
result.isnull().sum()
df.shape
# Pipeline v11: also median-impute audience_rating up front and one-hot the
# MPAA rating column at the end.
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count', 'audience_rating'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
('groupwise_median_imputer', GroupwiseMedianImputer({'audience_rating' : 'genre'})),
('numerical_median_imputer_2', ColumnwiseMedianImputer(['audience_rating'])),
('binning_builder', BinningBuilder('audience_rating', 'audience_status', [-10, 50, 101], ['Spilled', 'Upright'])),
('customized_one_hot_encoding', CustomizedOneHotEncoding(['audience_status'])),
('feature_picker', FeaturePicker(['rating',
'runtime_in_minutes',
'tomatometer_status',
'audience_rating',
'audience_status_Spilled',
'audience_status_Upright',
'audience_count',
'audience_top_critics_count',
'audience_fresh_critics_count',
'audience_rotten_critics_count'])),
('duplication_remover', DuplicationRemover(["rating",
"runtime_in_minutes",
"tomatometer_status",
"audience_rating",
"audience_status_Spilled",
"audience_status_Upright",
"audience_count",
"audience_top_critics_count",
"audience_fresh_critics_count",
"audience_rotten_critics_count"])),
('customized_one_hot_encoding_2', CustomizedOneHotEncoding(['rating'])),
])
result = pipe.transform(df)
# Pipeline v12: additionally impute runtime_in_minutes; the rating one-hot
# step is renamed but otherwise identical to v11.
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count', 'audience_rating', 'runtime_in_minutes'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
('groupwise_median_imputer', GroupwiseMedianImputer({'audience_rating' : 'genre'})),
('numerical_median_imputer_2', ColumnwiseMedianImputer(['audience_rating'])),
('binning_builder', BinningBuilder('audience_rating', 'audience_status', [-10, 50, 101], ['Spilled', 'Upright'])),
('customized_one_hot_encoding', CustomizedOneHotEncoding(['audience_status'])),
('feature_picker', FeaturePicker(['rating',
'runtime_in_minutes',
'tomatometer_status',
'audience_rating',
'audience_status_Spilled',
'audience_status_Upright',
'audience_count',
'audience_top_critics_count',
'audience_fresh_critics_count',
'audience_rotten_critics_count'])),
('duplication_remover', DuplicationRemover(["rating",
"runtime_in_minutes",
"tomatometer_status",
"audience_rating",
"audience_status_Spilled",
"audience_status_Upright",
"audience_count",
"audience_top_critics_count",
"audience_fresh_critics_count",
"audience_rotten_critics_count"])),
('customized_one_hot_encoding_rating', CustomizedOneHotEncoding(['rating'])),
])
result = pipe.transform(df)
class CustomizedTomatomesterStatusEncoder(BaseEstimator, TransformerMixin):
    """Collapse status code 2 (Fresh) into 1, making the target binary."""
    def __init__(self):
        pass
    def fit(self, x, y=None):
        # Stateless.
        return self
    def transform(self, x, y=None):
        fresh_mask = x['tomatometer_status'] == 2
        x.loc[fresh_mask, 'tomatometer_status'] = 1
        return x
# Final preprocessing pipeline: adds ordinal status encoding and then collapses
# Fresh (1) and Certified Fresh (2) handling into a binary target.
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count', 'audience_rating', 'runtime_in_minutes'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
('groupwise_median_imputer', GroupwiseMedianImputer({'audience_rating' : 'genre'})),
('numerical_median_imputer_2', ColumnwiseMedianImputer(['audience_rating'])),
('binning_builder', BinningBuilder('audience_rating', 'audience_status', [-10, 50, 101], ['Spilled', 'Upright'])),
('customized_one_hot_encoding', CustomizedOneHotEncoding(['audience_status'])),
('feature_picker', FeaturePicker(['rating',
'runtime_in_minutes',
'tomatometer_status',
'audience_rating',
'audience_status_Spilled',
'audience_status_Upright',
'audience_count',
'audience_top_critics_count',
'audience_fresh_critics_count',
'audience_rotten_critics_count'])),
('duplication_remover', DuplicationRemover(["rating",
"runtime_in_minutes",
"tomatometer_status",
"audience_rating",
"audience_status_Spilled",
"audience_status_Upright",
"audience_count",
"audience_top_critics_count",
"audience_fresh_critics_count",
"audience_rotten_critics_count"])),
('customized_one_hot_encoding_rating', CustomizedOneHotEncoding(['rating'])),
('tomatometer_status_discritizer', TomatometerStatusDiscritizer({'Certified Fresh':2,'Fresh':1,'Rotten':0})),
('customized_tomatomester_status_encoder', CustomizedTomatomesterStatusEncoder()),
])
result = pipe.transform(df)
# Codeblock
# Codeblock
# Codeblock
# Codeblock
import gc
import multiprocessing
# Report the CPU count available for parallel model fitting.
cpu_cnt = multiprocessing.cpu_count()
allocated_cpu = cpu_cnt
print(f"Allocated {allocated_cpu} CPUs")
gc.collect()
# Re-run of the final preprocessing pipeline (identical to the one above).
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count', 'audience_rating', 'runtime_in_minutes'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
('groupwise_median_imputer', GroupwiseMedianImputer({'audience_rating' : 'genre'})),
('numerical_median_imputer_2', ColumnwiseMedianImputer(['audience_rating'])),
('binning_builder', BinningBuilder('audience_rating', 'audience_status', [-10, 50, 101], ['Spilled', 'Upright'])),
('customized_one_hot_encoding', CustomizedOneHotEncoding(['audience_status'])),
('feature_picker', FeaturePicker(['rating',
'runtime_in_minutes',
'tomatometer_status',
'audience_rating',
'audience_status_Spilled',
'audience_status_Upright',
'audience_count',
'audience_top_critics_count',
'audience_fresh_critics_count',
'audience_rotten_critics_count'])),
('duplication_remover', DuplicationRemover(["rating",
"runtime_in_minutes",
"tomatometer_status",
"audience_rating",
"audience_status_Spilled",
"audience_status_Upright",
"audience_count",
"audience_top_critics_count",
"audience_fresh_critics_count",
"audience_rotten_critics_count"])),
('customized_one_hot_encoding_rating', CustomizedOneHotEncoding(['rating'])),
('tomatometer_status_discritizer', TomatometerStatusDiscritizer({'Certified Fresh':2,'Fresh':1,'Rotten':0})),
('customized_tomatomester_status_encoder', CustomizedTomatomesterStatusEncoder()),
])
result = pipe.transform(df)
# NOTE(review): another duplicate of the final preprocessing pipeline cell;
# the result is unchanged.
pipe = Pipeline([
('parentheseprocessed', parentheseprocessed('rating')),
('numerical_median_imputer', ColumnwiseMedianImputer(['audience_count', 'audience_rating', 'runtime_in_minutes'])),
('column_remover', ColumnRemover(['critics_consensus', 'rotten_tomatoes_link', 'poster_image_url'])),
('missing_rows_remover', MissRowsRemover(['directors','writers','studio_name'])),
('groupwise_median_imputer', GroupwiseMedianImputer({'audience_rating' : 'genre'})),
('numerical_median_imputer_2', ColumnwiseMedianImputer(['audience_rating'])),
('binning_builder', BinningBuilder('audience_rating', 'audience_status', [-10, 50, 101], ['Spilled', 'Upright'])),
('customized_one_hot_encoding', CustomizedOneHotEncoding(['audience_status'])),
('feature_picker', FeaturePicker(['rating',
'runtime_in_minutes',
'tomatometer_status',
'audience_rating',
'audience_status_Spilled',
'audience_status_Upright',
'audience_count',
'audience_top_critics_count',
'audience_fresh_critics_count',
'audience_rotten_critics_count'])),
('duplication_remover', DuplicationRemover(["rating",
"runtime_in_minutes",
"tomatometer_status",
"audience_rating",
"audience_status_Spilled",
"audience_status_Upright",
"audience_count",
"audience_top_critics_count",
"audience_fresh_critics_count",
"audience_rotten_critics_count"])),
('customized_one_hot_encoding_rating', CustomizedOneHotEncoding(['rating'])),
('tomatometer_status_discritizer', TomatometerStatusDiscritizer({'Certified Fresh':2,'Fresh':1,'Rotten':0})),
('customized_tomatomester_status_encoder', CustomizedTomatomesterStatusEncoder()),
])
result = pipe.transform(df)
class IdentityTransformer(BaseEstimator, TransformerMixin):
    """A no-op transformer: transform() returns its input unchanged."""
    def __init__(self):
        pass
    def fit(self, x, y=None):
        # Nothing to learn.
        return self
    def transform(self, x, y=None):
        # Pass-through.
        return x
def split_IV_DV(df, preprocessing_pipeline=None):
    """Split a frame into train/test features and the tomatometer_status target.

    Applies ``preprocessing_pipeline`` (fit on X, then transform) before the
    70/30 split, which is seeded by the module-level ``seed``. Defaults to a
    fresh no-op IdentityTransformer.

    Returns (X_train, X_test, y_train, y_test).
    """
    # FIX: avoid the shared-default-instance pitfall — the original default
    # `IdentityTransformer()` was evaluated once at definition time and the
    # same object was fit() on every call; create a new one per call instead.
    if preprocessing_pipeline is None:
        preprocessing_pipeline = IdentityTransformer()
    X = df.drop('tomatometer_status', axis=1)
    y = df['tomatometer_status']
    X = preprocessing_pipeline.fit(X).transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=seed)
    return (X_train, X_test, y_train, y_test)
On the basis of many explanatory variables we make a binary classification of whether a subscriber will like the movie or not.
# Codeblock
# Logistic-regression baseline: grid-search penalty/solver/C with 10-fold
# stratified CV on standardized features.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import pickle
pipe =make_pipeline(StandardScaler())
X_train, X_test,y_train, y_test = split_IV_DV(result, pipe)
hyperparameters={'penalty': ['l1','l2'],
'solver': ['liblinear','saga'],
'C':[0.001,0.0001],
'max_iter':[1000]}
logistic=LogisticRegression()
cv=StratifiedKFold(n_splits=10,shuffle=True,random_state=42)
logit_gs = GridSearchCV(estimator=logistic,param_grid=hyperparameters,cv=cv,verbose=0)
logit_gs.fit(X_train,y_train)
# Persist the fitted grid search for later reuse.
filename = 'finalized_logistic_model_C_1_L2.sav'
pickle.dump(logit_gs,open(filename,'wb'))
# load the model from disk
print (logit_gs.best_score_)
logit_best=logit_gs.best_estimator_
y_pred=logit_gs.predict(X_test)
logit_best.score(X_train, y_train)
# NOTE(review): the (y_pred, y_test) argument order is the reverse of
# sklearn's (y_true, y_pred) convention, so precision and recall below are
# effectively swapped — verify intent.
lg_report = pd.DataFrame(classification_report(y_pred,y_test,output_dict=True)).transpose()
print(f"Precision:{round (metrics.precision_score(y_pred,y_test)*100)}%")
print(f"Recall : {round(metrics.recall_score(y_pred,y_test)*100)}%")
print(f"Accuracy: {round (metrics.accuracy_score(y_pred,y_test)*100)}%")
print(classification_report(y_pred,y_test))
ROC curve for the logistic regression model
# Baseline "no skill" classifier: always predicts the negative class.
noskill_prob = [0 for _ in range(len(X_test))]
# Probability of the positive class from the tuned logistic model.
logit_prob = logit_best.predict_proba(X_test)[:, 1]
# Area under the ROC curve for the baseline and the model.
noskill_auc = roc_auc_score(y_test, noskill_prob)
logit_auc = roc_auc_score(y_test, logit_prob, average='macro')
print('No Skill: ROC AUC=%.3f' % (noskill_auc))
print('Logistic Regression : ROC AUC=%.3f' % (logit_auc))
# ROC curve points for both classifiers.
ns_fpr, ns_tpr, _ = roc_curve(y_test, noskill_prob)
logit_fpr, logit_tpr, _ = roc_curve(y_test, logit_prob)
# Plot both curves on one figure.
plt.figure(figsize=(16, 8), dpi=300)
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(logit_fpr, logit_tpr, marker='.', label='Logistic Regression ')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestRegressor
import pickle
n_estimators — the number of trees in the forest.
max_features — the maximum number of features considered when splitting a node.
# Codeblock
# 3-fold CV for the random-forest grid search. It is now passed to
# GridSearchCV; previously `cv` was defined but never used, so the
# default cross-validation silently applied instead.
cv = KFold(n_splits=3, shuffle=True)
X_train, X_test, y_train, y_test = split_IV_DV(result, pipe)
rf = RandomForestClassifier()
params = {'n_estimators': range(80, 140, 20),
          'max_depth': range(3, 6),
          'max_features': range(3, 5),
          'criterion': ['entropy']}
gs_rf = GridSearchCV(estimator=rf, param_grid=params, scoring="accuracy",
                     cv=cv, n_jobs=-1, verbose=0)
gs_rf.fit(X_train, y_train)
# Persist the fitted search object.
filename = 'finalized_rf_model_estimator_150_no_criterion.sav'
pickle.dump(gs_rf, open(filename, 'wb'))
print(gs_rf.best_score_)
rf_best = gs_rf.best_estimator_
y_pred = rf_best.predict(X_test)
rf_best.score(X_train, y_train)
rf_best.get_params()  # original was missing the call parentheses
# Re-split without the scaling pipeline so X_train keeps its DataFrame
# columns for the feature-importance table below.
X_train, X_test, y_train, y_test = split_IV_DV(result)
rf_feature = pd.DataFrame(zip(X_train.columns, rf_best.feature_importances_),
                          columns=['features', 'importance'])
rf_feature.sort_values(by='importance', ascending=False)
# (y_true, y_pred) order — the original report call swapped the arguments.
rf_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
print(f"Precision Score:{round(metrics.precision_score(y_test, y_pred) * 100)}%")
print(f"Recall Score:{round(metrics.recall_score(y_test, y_pred) * 100)}%")
print(f"Accuracy Score:{round(metrics.accuracy_score(y_test, y_pred) * 100)}%")
print(classification_report(y_test, y_pred))
# Majority-class baseline probabilities.
noskill_prob = [0] * len(y_test)
# Positive-class probabilities from the best random forest.
rf_prob = rf_best.predict_proba(X_test)[:, 1]
# AUC for the baseline and the model.
noskill_auc = roc_auc_score(y_test, noskill_prob)
rf_auc = roc_auc_score(y_test, rf_prob, average='macro')
print('No Skill: ROC AUC=%.3f' % (noskill_auc))
print('Random Forest: ROC AUC=%.3f' % (rf_auc))
# ROC curve points.
ns_fpr, ns_tpr, _ = roc_curve(y_test, noskill_prob)
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_prob)
# Plot both curves.
plt.figure(figsize=(16, 8), dpi=300)
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(rf_fpr, rf_tpr, marker='.', label='Random Forest')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold

X_train, X_test, y_train, y_test = split_IV_DV(result, pipe)
# 10-fold CV with the module seed. The original passed random_state=True,
# i.e. the bool True, which only accidentally acts as a valid seed of 1.
cv = KFold(n_splits=10, shuffle=True, random_state=seed)
params = {'learning_rate': np.arange(0.01, 0.09, 0.01),
          'n_estimators': range(10, 50, 10),
          'max_features': range(1, 3),
          }
gbm = GradientBoostingClassifier()
grid_search_gb = GridSearchCV(estimator=gbm, param_grid=params, scoring="accuracy",
                              n_jobs=-1, cv=cv, verbose=0)
grid_search_gb.fit(X_train, y_train)
# Persist the fitted search object.
filename = ("finalized_gb_learning_rate_1_min_leaf_5.sav")
pickle.dump(grid_search_gb, open(filename, "wb"))
for hps, values in grid_search_gb.best_params_.items():
    print(f"{hps,values}")
# best_estimator_ is already refit on the full training set by GridSearchCV
# (refit=True by default), so the extra fit() call was removed.
gb_best = grid_search_gb.best_estimator_
y_pred = gb_best.predict(X_test)
print(grid_search_gb.best_score_)
gb_best.score(X_train, y_train)
print(f"Precision Score:{round(metrics.precision_score(y_test, y_pred, average='micro') * 100)}%")
print(f"Recall Score:{round(metrics.recall_score(y_test, y_pred, average='micro') * 100)}%")
print(f"Accuracy Score:{round(metrics.accuracy_score(y_test, y_pred) * 100)}%")
print(classification_report(y_test, y_pred))
# (y_true, y_pred) order — the original report call swapped the arguments,
# which exchanges the precision and recall columns.
gbt_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
# Majority-class baseline probabilities.
noskill_prob = [0] * len(y_test)
# Positive-class probabilities from the tuned gradient-boosting model.
gb_prob = gb_best.predict_proba(X_test)[:, 1]
# AUC for the baseline and the model.
noskill_auc = roc_auc_score(y_test, noskill_prob)
gb_auc = roc_auc_score(y_test, gb_prob)
print('No Skill: ROC AUC=%.3f' % (noskill_auc))
print('Gradient Boosting: ROC AUC=%.3f' % (gb_auc))
# ROC curve points.
ns_fpr, ns_tpr, _ = roc_curve(y_test, noskill_prob)
gb_fpr, gb_tpr, _ = roc_curve(y_test, gb_prob)
# Plot both curves.
plt.figure(figsize=(16, 8))
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(gb_fpr, gb_tpr, marker='.', label='Gradient Boosting')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
# from sklearn.model_selection import GridSearchCV
# param_grid = {'learning_rate': np.arange(0.01, 0.2, 0.05),
# 'n_estimators': range(50, 200,10),
# 'max_features':range(1, 10),
# 'max_depth': range(2, 5)}
# clf = GridSearchCV(gbm(), param_grid)
# clf.fit(X_train_std, y_train)
# print(f"Accuracy for best GBM: {round(clf.best_score_*100, 2)}%")
We group audiences into clusters so they can be targeted with similar types of movies, and we measure the accuracy of this cluster-based classification against naive Bayes, which computes a probabilistic score for the likelihood of watching a movie.
# Codeblock
pipe = Pipeline([('standardizer', StandardScaler()), ])
X_train, X_test, y_train, y_test = split_IV_DV(result, pipe)

# Elbow plot: mean test-set error rate for k = 1..34.
error = []
for i in range(1, 35):
    knn = neighbors.KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    # Compare this k's predictions against the true labels. The original
    # used np.mean(y_pred != pred), i.e. against the *previous model's*
    # predictions, so the curve measured agreement with gradient boosting
    # rather than the KNN error rate.
    error.append(np.mean(pred != y_test))
plt.figure(figsize=(15, 5))
plt.plot(range(1, 35), error, color='orange', linestyle='dashed', marker='o',
         markerfacecolor='cyan', markersize=10)
plt.title("Error Rate for KNN")
plt.xlabel("K Value")
plt.ylabel("Mean Error")
With p = 1 the classifier uses the Manhattan distance; with p = 2 it uses the Euclidean distance.
param_grid = {'weights': ['distance'],
              'algorithm': ['brute'],
              'p': [2],
              }
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
cv = StratifiedKFold(n_splits=5, shuffle=True)
# n_jobs=-1 uses all cores; the original passed n_jobs=allocated_cpu,
# a name that is never defined anywhere in the file (NameError).
grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid,
                               n_jobs=-1, verbose=0, cv=cv)
grid_search_knn.fit(X_train, y_train)
# Persist the fitted search object.
filename = ("finalized_knn_algo_brute_p_2.sav")
pickle.dump(grid_search_knn, open(filename, "wb"))
for hps, values in grid_search_knn.best_params_.items():
    print(f"{hps,values}")
knn_best = grid_search_knn.best_estimator_
knn_best.fit(X_train, y_train)
print(y_test.shape)
y_pred = knn_best.predict(X_test)
# (y_true, y_pred) order — the original calls swapped the arguments,
# which exchanges precision and recall.
knn_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
print(f"Precision:{round(metrics.precision_score(y_test, y_pred) * 100)}%")
print(f"Recall : {round(metrics.recall_score(y_test, y_pred) * 100)}%")
print(f"Accuracy : {round(metrics.accuracy_score(y_test, y_pred) * 100)}%")
print(classification_report(y_test, y_pred))
# Majority-class baseline probabilities.
noskill_prob = [0] * len(X_test)
# Positive-class probabilities from the tuned KNN model.
knn_prob = knn_best.predict_proba(X_test)[:, 1]
# AUC for the baseline and the model.
noskill_auc = roc_auc_score(y_test, noskill_prob)
knn_auc = roc_auc_score(y_test, knn_prob)
print('No Skill: ROC AUC=%.3f' % (noskill_auc))
print('Knn : ROC AUC=%.3f' % (knn_auc))
# ROC curve points.
ns_fpr, ns_tpr, _ = roc_curve(y_test, noskill_prob)
knn_fpr, knn_tpr, _ = roc_curve(y_test, knn_prob)
# Plot both curves.
plt.figure(figsize=(16, 8))
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(knn_fpr, knn_tpr, marker='.', label='KNN')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
Assigning values to a new data frame
# Fresh working copies for association-rule mining; `data` stays pristine.
df = data.copy()
df_association = df.copy()
df_association.head(3)
# Free-text / identifier columns carry no transactional signal — drop them.
text_cols = ['cast', 'critics_consensus', 'movie_title',
             'rotten_tomatoes_link', 'movie_info', 'poster_image_url']
df_association.drop(text_cols, axis=1, inplace=True)
Binning categorical variables on the basis of the audience top-critics count.
# Inspect the distribution before choosing bin edges.
print(df_association['audience_top_critics_count'].min())
print(df_association['audience_top_critics_count'].max())
print(df_association['audience_top_critics_count'].median())
df_association['audience_top_critics_count'].describe()
# Three popularity bands over the top-critics count.
labels = ('Disliked_Genre', 'Watchable_Genre', 'Famous_Genre')
df_association['genre_label'] = pd.cut(df_association['audience_top_critics_count'],
                                       bins=(0, 13, 37, 64), labels=labels)
# 'Famous_Studio' had a stray space ('Famous _Studio') in the original label.
# NOTE(review): genre_label and studio_label are both derived from the same
# audience_top_critics_count column — confirm studio_label was not meant to
# bin a studio-related measure instead.
labels = ('Dull_Studio', 'Regular_Studio', 'Famous_Studio')
df_association['studio_label'] = pd.cut(df_association['audience_top_critics_count'],
                                        bins=(0, 13, 37, 64), labels=labels)
The longest movie ever made ran a total of 773 minutes, so we winsorize the column at that value.
# Cap runtimes at 773 minutes (the longest film ever made).
df_association['runtime_in_minutes'] = df_association['runtime_in_minutes'].clip(upper=773)
# Print the five-number summary of the capped runtimes.
runtime_summary = df_association['runtime_in_minutes'].describe()
for stat in ('min', '25%', '50%', '75%', 'max'):
    print(runtime_summary.loc[stat])
Discretizing variables on the basis of the Rotten Tomatoes rating; binning writer, director, and studio labels on the basis of audience ratings.
# Tercile-based status labels for directors and writers
# (both computed from the audience_top_critics_count column).
director_labels = ('Rotten_Director', 'Regular_Director', 'Famous_Director')
df_association['Director_Status'] = pd.qcut(df_association['audience_top_critics_count'],
                                            q=3, labels=director_labels)
writer_labels = ('Rotten_Writer', 'Regular_Writer', 'Famous_Writer')
df_association['Writer_status'] = pd.qcut(df_association['audience_top_critics_count'],
                                          q=3, labels=writer_labels)
Assuming that an average movie runs from 1 to 3 hours.
# Bin runtimes: under 1 hour, 1-3 hours, and over 3 hours (up to the cap).
labels = ('Short_Runtime', 'Average_Time', 'Long_Runtime')
df_association['movie_runtime'] = pd.cut(df_association['runtime_in_minutes'],
                                         bins=(0, 60, 180, 773), labels=labels)
# Five-number summary of the audience count before binning it.
print(df_association['audience_count'].describe().loc['min'])
print(df_association['audience_count'].describe().loc['25%'])
print(df_association['audience_count'].describe().loc['50%'])
print(df_association['audience_count'].describe().loc['75%'])
print(df_association['audience_count'].describe().loc['max'])
# Tercile label for audience size. (The original performed this exact
# qcut assignment twice in a row; the verbatim duplicate was removed.)
labels = ('Rotten_Count', 'Regular_Count', 'Filled_Count')
df_association['audience_count_label'] = pd.qcut(df_association['audience_count'],
                                                 q=3, labels=labels)
# Drop raw columns that have been replaced by the binned label columns above.
df_association.drop(['genre','directors','writers','in_theaters_date','on_streaming_date','studio_name','audience_count'],axis=1,inplace=True)
def create_dummies(col_name, frame=None):
    """One-hot encode *col_name* and prefix each dummy with the column name.

    Parameters
    ----------
    col_name : str
        Column to encode.
    frame : pandas.DataFrame, optional
        Source data; defaults to the module-level ``df_association`` so
        existing call sites keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        Dummy columns named ``"<col_name>_<value>"``.
    """
    if frame is None:
        frame = df_association
    dumdf = pd.get_dummies(frame[col_name])
    dumdf.columns = [col_name + "_" + str(col) for col in dumdf.columns]
    return dumdf
df_association.dtypes
# pd.cut/qcut produce Categorical columns; convert the label columns to
# plain objects before the dummy-encoding step.
for label_col in ('genre_label', 'studio_label', 'Director_Status',
                  'Writer_status', 'movie_runtime'):
    df_association[label_col] = df_association[label_col].astype(object)
# One-hot encode each categorical column in turn, accumulating the result
# in dumdf_1 (original column dropped, dummy columns joined in its place).
# NOTE(review): dumdf_1 is not referenced again in this part of the file —
# the association mining below operates on df_association directly. Confirm
# whether this encoded frame is still needed.
ratingDum=create_dummies('rating')
dumdf_1=df_association.drop('rating',axis=1).join(ratingDum)
tomatometer_statusDum=create_dummies('tomatometer_status')
dumdf_1=dumdf_1.drop('tomatometer_status',axis=1).join(tomatometer_statusDum)
audience_statusDum=create_dummies('audience_status')
dumdf_1=dumdf_1.drop('audience_status',axis=1).join(audience_statusDum)
genre_labelDum=create_dummies('genre_label')
dumdf_1=dumdf_1.drop('genre_label',axis=1).join(genre_labelDum)
studio_labelDum=create_dummies('studio_label')
dumdf_1=dumdf_1.drop('studio_label',axis=1).join(studio_labelDum)
Director_StatusDum=create_dummies('Director_Status')
dumdf_1=dumdf_1.drop('Director_Status',axis=1).join(Director_StatusDum)
Writer_statusDum=create_dummies('Writer_status')
dumdf_1=dumdf_1.drop('Writer_status',axis=1).join(Writer_statusDum)
movie_runtimeDum=create_dummies('movie_runtime')
dumdf_1=dumdf_1.drop('movie_runtime',axis=1).join(movie_runtimeDum)
audience_countDum=create_dummies('audience_count_label')
dumdf_1=dumdf_1.drop('audience_count_label',axis=1).join(audience_countDum)
dumdf_1.head()
# Remove the remaining numeric columns so only categorical label columns
# are left for transaction encoding.
numeric_leftovers = ['runtime_in_minutes',
                     'audience_fresh_critics_count', 'audience_top_critics_count',
                     'audience_rotten_critics_count',
                     'tomatometer_rating', 'tomatometer_count', 'audience_rating']
df_association.drop(numeric_leftovers, axis=1, inplace=True)
Preparing the data for association rule mining.
# Normalize every cell to a clean string before building transactions.
df_association = df_association.astype('str')
df_association[df_association.columns] = df_association.apply(lambda col: col.str.strip())
df_association = df_association.astype('object')
# Tag each value with its column, e.g. "rating=PG", so items remain
# unambiguous once rows are flattened into transactions.
categorical_data = df_association.select_dtypes(include=['object'])
categorical_data = categorical_data.apply(
    lambda col: col.apply(lambda element: f"{col.name}={element}"))
categorical_data.head(5)
# One transaction (list of "column=value" items) per movie.
records = [[str(cell) for cell in row] for row in categorical_data.values]
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the transactions into a boolean item matrix.
te = TransactionEncoder()
encoded = te.fit(records).transform(records)
df2 = pd.DataFrame(encoded, columns=te.columns_)
df2.head()
# Frequent itemsets at mlxtend's default minimum support.
frequent_itemsets = apriori(df2, use_colnames=True)
frequent_itemsets.sort_values(by='support', ascending=False).head(6)
# Association rules at the default metric/threshold.
rules = association_rules(frequent_itemsets)
rules.sort_values(by='lift', ascending=False).head(10)
# Function to set consequents and run association rules
def SupervisedApriori(data, consequent, min_supp, min_conf, min_lift):
    """Mine association rules whose consequent is one of the given items.

    Parameters
    ----------
    data : pandas.DataFrame
        One-hot encoded transaction matrix (TransactionEncoder output).
    consequent : list of str
        Item names to keep as rule consequents.
    min_supp, min_conf, min_lift : float
        Minimum support, confidence, and lift thresholds.

    Returns
    -------
    pandas.DataFrame
        Rules filtered to the requested consequents, index reset.
    """
    frequent_itemsets = apriori(data, min_supp, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_conf)
    # Keep only rules with lift above the requested floor.
    rules = rules[rules['lift'] > min_lift]
    # mlxtend stores consequents as frozensets, so compare against {item}.
    # DataFrame.append was removed in pandas 2.0; collect and concat instead.
    matches = [rules[rules['consequents'] == {item}] for item in consequent]
    if not matches:
        return pd.DataFrame()
    sup_rules = pd.concat(matches, ignore_index=True)
    return (sup_rules)
The rules below are the most interesting because they have the highest lift (values > 1).
Rules for a movie to be Certified Fresh:
For a movie to be critically successful, it should be directed by a revered director or written by a famous writer who is able to communicate the story.
The audience should also enjoy the movie, showing that audience and critic opinions match.
# Rules that imply a Certified Fresh tomatometer status.
movie_Cfresh = SupervisedApriori(df2,
                                 consequent=['tomatometer_status=Certified Fresh'],
                                 min_supp=0.11, min_conf=0.41, min_lift=1)
movie_Cfresh.sort_values(by="lift", ascending=False).head(10)
For a movie to be rotten, it is typically published by a disliked studio that cashes in on genres that may sell well but carry a very strong dislike level. Studios with such infamous personas tend to make R-rated movies, which also leads to a movie being rotten.
As stated earlier, the audience status matches the critics' consensus: if a movie is disliked by moviegoers, it is also disliked by the critics.
# Rules that imply a Rotten tomatometer status.
movie_rotten = SupervisedApriori(df2,
                                 consequent=['tomatometer_status=Rotten'],
                                 min_supp=0.11, min_conf=0.7, min_lift=1)
movie_rotten.sort_values(by="lift", ascending=False).head(10)
We estimate the likelihood of watching a movie on the presumption that the subscriber has watched a similar class of movie, using the prior probability that similar groups of audiences liked the movie.
# Codeblock
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics

gnb = GaussianNB()
mnb = MultinomialNB()  # NOTE(review): defined but never fitted in this section
# NOTE(review): this StratifiedKFold is not passed to GridSearchCV below
# (cv=5 is used instead) — confirm which cross-validation was intended.
cv = StratifiedKFold(n_splits=10, shuffle=True)
pipe = Pipeline([
    ('standardizer', StandardScaler()),
])
X_train, X_test, y_train, y_test = split_IV_DV(result, pipe)
# The original grid was np.arange(1e-9, 1).tolist(), which contains the
# single value 1e-9 (arange's default step is 1), so nothing was actually
# tuned. Search smoothing values on a log scale instead.
params = {
    'var_smoothing': np.logspace(0, -9, num=10).tolist()
}
grid_search_nb_g = GridSearchCV(estimator=gnb, cv=5, param_grid=params, scoring="accuracy")
grid_search_nb_g.fit(X_train, y_train)
# Persist the fitted search object.
filename = 'finalized_gnb.sav'
pickle.dump(grid_search_nb_g, open(filename, 'wb'))
print(grid_search_nb_g.best_score_)
gnb_best = grid_search_nb_g.best_estimator_
y_pred = grid_search_nb_g.predict(X_test)
Predicting the polarity of reviews by mining text and using naive Bayes to predict the type and degree of polarity.
# Peek at the raw data and an example synopsis.
data.head(3)
data['movie_info'][0]
#!pip install stanfordnlp
# Codeblock
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import webtext
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources needed for VADER sentiment scoring.
nltk.download('stopwords')
nltk.download('vader_lexicon')
# Replace the stopwords module binding with the actual English stopword set.
stopwords = set(stopwords.words('english'))
sid = SentimentIntensityAnalyzer()
data['critics_consensus'][1]
Removing Missing Values
# Count the missing critic-consensus entries before dropping them.
data.critics_consensus.isnull().sum()
Copying Data to a Dataframe
# Work on a copy so `data` itself is left untouched.
sa=data.copy()
Removing rows that have null values
# Drop rows with nulls, renumber, and keep only the consensus text.
# NOTE(review): dropna() removes rows with a null in *any* column, not just
# critics_consensus — confirm that is intended
# (dropna(subset=['critics_consensus']) would keep more reviews).
sa=sa.dropna()
sa.shape
# NOTE(review): reset_index renumbers these rows from 0 while `df` keeps the
# original row labels; a later pd.concat(..., axis=1) with columns taken from
# df aligns on index, so rows may not line up — verify.
sa.reset_index(inplace=True)
sa=sa['critics_consensus']
import plotly.express as px  # px was used below but never imported

# Score every consensus sentence with VADER. Each iteration overwrites
# `sentiment`, so after the loop it holds the scores of the *last* sentence
# only. NOTE(review): the narrative below describes corpus-level
# percentages — confirm whether an aggregate over all sentences was
# intended instead. (An unused `newdict = {}` was also removed.)
for sentence in sa:
    ss = sid.polarity_scores(sentence)
    pol = [ss[(k)] for k in sorted(ss)]
    sentiment = [k for k in sorted(ss)]
    sentiment = pd.DataFrame(pol, sentiment)
    sentiment.columns = ['Frequency']
    sentiment['Polarity'] = sentiment.index
(sentiment)
sentimentfig = px.pie(sentiment, values='Frequency', names='Polarity')
sentimentfig.show()
This means the corpus was rated as 37% positive, 62% neutral, and 0% negative; the positive, neutral, and negative proportions sum to 1.
# Score each consensus text and collect the VADER score dicts.
polarity = [sid.polarity_scores(text) for text in sa]
sa = pd.DataFrame(sa)
polarity = pd.DataFrame(polarity)
Concatenating Polarity with Movie Critic Review
# Side-by-side frame: review text plus its neg/neu/pos/compound columns.
df_sa=pd.concat([sa,polarity],axis=1)
df_sa.head()
# Title/director/writer columns to attach to the polarity table.
sub_df=df[['movie_title','directors','writers']]
Concatenating Movie Name along with director and writer who directed it to movie polarity
# Attach titles/credits to the polarity scores.
# NOTE(review): df_sa was built from a dropna+reset_index copy while sub_df
# keeps df's original index; this axis-1 concat aligns on index — verify
# that rows actually match.
df_sa = pd.concat([sub_df, df_sa], axis=1)
df_sa.drop('critics_consensus', axis=1, inplace=True)
# Split into reviews with any positive / any negative polarity.
df_pos = df_sa[df_sa['pos'] > 0]
df_neg = df_sa[df_sa['neg'] > 0]
df_pos.head()
df_pos.sort_values(by='pos', ascending=False)['movie_title'].head(10)
# Titles are required for the bar-chart axes.
df_pos = df_pos[df_pos['movie_title'].notna()]
df_neg = df_neg[df_neg['movie_title'].notna()]
top_pos = df_pos.sort_values(by='pos', ascending=False)
fig = px.bar(top_pos[:10], y='pos',
             x=top_pos['movie_title'].head(10),
             title='Movies with the most postive polarity ')
fig.show()
top_neg = df_neg.sort_values(by='neg', ascending=False)
fig = px.bar(top_neg[:10], y='neg',
             x=top_neg['movie_title'].head(10),
             title='Movies with the most negative polarity ')
fig.show()
Plotting the directors with the most polarity.
# Count how many positively / negatively scored reviews each director has,
# plus each director's total number of movies.
df_pos_director = Counter(df_pos['directors'].tolist())
df_neg_director = Counter(df_neg['directors'].tolist())
df_total = Counter(df['directors'].tolist())
df_pos_director = sorted(df_pos_director.items(), key=operator.itemgetter(1), reverse=True)
df_neg_director = sorted(df_neg_director.items(), key=operator.itemgetter(1), reverse=True)
df_director = sorted(df_total.items(), key=operator.itemgetter(1), reverse=True)
df_pos_director = pd.DataFrame(df_pos_director, columns=['directors', 'Postive_Sentiments_Directors']).set_index('directors')
df_neg_director = pd.DataFrame(df_neg_director, columns=['directors', 'Negative_Sentiments_Directors']).set_index('directors')
df_director = pd.DataFrame(df_director, columns=['directors', 'Total']).set_index('directors')
# Filtering by total movies made (>5) for a director to be recognized.
# The original used groupby('directors').filter(lambda x: x > 5), which
# raises (the filter callable must return a scalar bool) and whose result
# was discarded anyway; a plain boolean mask implements the stated intent.
df_director = df_director[df_director['Total'] > 5]
df_directors = pd.concat([df_pos_director, df_neg_director, df_director], axis=1)
df_directors = df_directors.fillna(0)
df_directors = df_directors.sort_values(by='Postive_Sentiments_Directors', ascending=False)
# NOTE(review): the [1:11] slice skips the first row — presumably a NaN /
# placeholder director bucket; confirm this is intentional.
fig = px.bar(df_directors[1:11],
             y='Postive_Sentiments_Directors',
             x=df_directors[1:11].index,
             title='The Top 10 Directors by Sentiments')
fig.show()
df_directors = df_directors.sort_values(by='Negative_Sentiments_Directors', ascending=False)
fig = px.bar(df_directors[1:11],
             y='Negative_Sentiments_Directors',
             x=df_directors[1:11].index,
             title='The Top 10 Most hated Directors by Sentiments')
fig.show()
# Summary table: one row per classifier, pulling precision/recall/accuracy
# out of each stored classification-report frame plus the ROC-AUC scores.
reports = {'KNN': knn_report, 'Logistic': lg_report,
           'Random_Forest': rf_report, 'Gradient_Boosting': gbt_report}
Model_comparison = {
    'Model': list(reports.keys()),
    'Precision': [r.iloc[1, 0] for r in reports.values()],
    'Recall': [r.iloc[1, 1] for r in reports.values()],
    'Accuracy': [r.iloc[2, 1] for r in reports.values()],
    'ROC-AUC Score': [knn_auc, logit_auc, rf_auc, gb_auc],
}
pd.DataFrame(Model_comparison)